# coding: utf-8

# In[318]:

# Load the libraries
import pandas as pd               # to work with the dataset
import numpy as np                # math library
import seaborn as sns             # graph library that uses matplotlib in the background
import matplotlib.pyplot as plt   # to plot some parameters in seaborn

# Importing the data
df_credit = pd.read_csv("C:/Users/Kai/Desktop/Assignment3/data/credit-g_preproccess.csv", index_col=0)


# In[319]:

# First look at the data:
## the data types
## the number of null values
## the unique values
df_credit.head(6)


# In[320]:

# Check for missing values and data types, and get the shape of the data
print(df_credit.info())

# Check whether there are any null values
print(df_credit.isnull().sum())
df_credit.shape


# In[321]:

# Look at the number of unique values per column
print(df_credit.nunique())


# In[322]:

# Transforming the categorical columns into dummy variables (IMPORTANT)
def one_hot_encoder(df, nan_as_category=False):
    original_columns = list(df.columns)
    categorical_columns = [col for col in df.columns if df[col].dtype == 'object']
    df = pd.get_dummies(df, columns=categorical_columns, dummy_na=nan_as_category, drop_first=True)
    new_columns = [c for c in df.columns if c not in original_columns]
    return df, new_columns

df_credit, new_columns = one_hot_encoder(df_credit)
df_credit.head(6)


# In[323]:

print(df_credit.info())


# In[324]:

new_columns


# In[325]:

# Purpose column to dummy variables (already handled by one_hot_encoder above)
# df_credit = df_credit.merge(pd.get_dummies(df_credit.purpose, drop_first=True, prefix='purpose'), left_index=True, right_index=True)


# In[326]:

plt.figure(figsize=(20, 18))
sns.heatmap(df_credit.astype(float).corr(), linewidths=0.1, vmax=1.0,
            square=True, linecolor='white', annot=True)
plt.show()


# In[327]:

from sklearn.model_selection import train_test_split, KFold, cross_val_score  # to split the data
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, fbeta_score  # to evaluate our model
from sklearn.model_selection import GridSearchCV

# Algorithms (models) to be compared
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Creating the X and y variables
X = df_credit.drop('class_good', axis=1).values
y = df_credit["class_good"].values


# In[328]:

# Splitting X and y into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# to feed the random state
seed = 7

# prepare models
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('RF', RandomForestClassifier()))
models.append(('SVM', SVC(gamma='auto')))

# evaluate each model in turn with 10-fold cross-validation and plot the results
def pltFoldMethodsResult(scor):
    results = []
    names = []
    scoring = scor
    for name, model in models:
        kfold = KFold(n_splits=10, shuffle=True, random_state=seed)
        cv_results = cross_val_score(model, X_train, y_train, cv=kfold, scoring=scoring)
        results.append(cv_results)
        names.append(name)
        msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
        print(msg)
    # boxplot algorithm comparison
    fig = plt.figure(figsize=(11, 6))
    fig.suptitle('Algorithm Comparison')
    ax = fig.add_subplot(111)
    plt.boxplot(results)
    ax.set_xticklabels(names)
    plt.ylabel(scor)
    plt.show()
# In[329]:

pltFoldMethodsResult('accuracy')


# In[330]:

pltFoldMethodsResult('recall')


# In[331]:

pltFoldMethodsResult('precision')


# In[332]:

pltFoldMethodsResult('f1')


# In[333]:

pltFoldMethodsResult('roc_auc')


# In[334]:

from sklearn.utils import resample
from sklearn.metrics import roc_curve

# Creating the GaussianNB classifier
GNB = GaussianNB()

# Fitting with the training data
model = GNB.fit(X_train, y_train)

# Printing the training score
print("Training score data: ")
print(model.score(X_train, y_train))

y_pred = model.predict(X_test)

print(accuracy_score(y_test, y_pred))
print("\n")
print(confusion_matrix(y_test, y_pred))
print("\n")
print(classification_report(y_test, y_pred))

# Predicting the probability of the positive class
y_pred_prob = model.predict_proba(X_test)[:, 1]

# Generate ROC curve values: fpr, tpr, thresholds
fpr, tpr, thresholds = roc_curve(y_test, y_pred_prob)

# Plot ROC curve
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.show()
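

# In[ ]:

# Optional follow-up sketch (not part of the original notebook): summarise the
# ROC curve above with a single number using sklearn's roc_auc_score on the
# same predicted probabilities.
from sklearn.metrics import roc_auc_score

print("Test ROC AUC: %f" % roc_auc_score(y_test, y_pred_prob))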
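

# In[ ]:

# GridSearchCV is imported above but never used. Below is a minimal, optional
# sketch of how it could be applied to tune the RandomForestClassifier on the
# same training split; the parameter grid values and the roc_auc scoring choice
# are illustrative assumptions, not part of the original analysis.
param_grid = {
    'n_estimators': [100, 200],
    'max_depth': [None, 5, 10],
}
grid_rf = GridSearchCV(RandomForestClassifier(random_state=seed),
                       param_grid=param_grid,
                       scoring='roc_auc',
                       cv=5)
grid_rf.fit(X_train, y_train)
print(grid_rf.best_params_)
print(grid_rf.best_score_)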